
*********************************************************************************
* Getting the MEPS data ready for analysis 
*********************************************************************************
* the main thing is to reshape the data so that we only have one line for each ID. 
* because each individual is followed for 2 years (more in panels 23 and 24), the initial file contains 2 lines per ID. 


* Load the merged MEPS consolidated file.
* The path uses a forward slash: Stata accepts "/" on Windows, macOS and Linux,
* whereas the former "\" separator only resolves on Windows.
clear
use "${data}/Consolidated_data_merged.dta"

* Basic Checks on the data
* NOTE: the whole block runs under -quietly-, so the -tab- tables below are not
* displayed unless the qui{ } wrapper is removed or a command is prefixed with -noisily-.
qui{
set more off
sort ID PANEL
* all 1996 rows are assigned to panel 1
replace PANEL=1 if YEAR==1996
***people with the same ID should be the same individuals-- check that they have the same bday (year)
sort  ID PANEL 
* Make sure ID is stored in double precision BEFORE the panel prefix is added:
* PANEL*10000000000 pushes ID past what float (and long) storage can hold exactly,
* and -format- only changes how the number is displayed, not how it is stored.
recast double ID
format ID %16.0g
* prefix the panel number so that an ID is unique across panels
replace ID=ID+PANEL*10000000000 
sort ID DOBYY
* alarm  flags the LATER  row of a same-ID pair whose birth years disagree,
* alarm1 flags the EARLIER row of such a pair
gen alarm=1 if ID==ID[_n-1]  &  DOBYY!=DOBYY[_n-1] 
gen alarm1=1 if ID==ID[_n+1]  &  DOBYY!=DOBYY[_n+1] 
tab alarm
tab alarm1
order ID PANEL

tab AGEX if alarm==1
tab AGE31X if alarm==1
tab AGEX if alarm1==1
tab WEIGHT if alarm==1
 
* NOTE(review): only the rows flagged by alarm are dropped; the partner row flagged
* by alarm1 stays in the data. Confirm this one-sided drop is intended.
drop if alarm==1 //it looks like all these people are mainly 85 or 90. - we do not use these anyway, will drop anyone aged 84 and above later
drop alarm alarm1


***people with the same ID should be part of the same panel: check that this is the case
* NOTE(review): ID already contains PANEL*10000000000 at this point, so two rows with
* the same ID carry the same PANEL by construction and this check cannot fail.
gen test=.
replace test=1 if ID==ID[_n-1] & PANEL!=PANEL[_n-1]
replace test=1 if ID==ID[_n+1] & PANEL!=PANEL[_n+1]
sort test ID PANEL
tab test
drop if test==1
*0 observations deleted, all good
drop test

/*
*checking that we always have only at most 2 observations per ID - yes, in the panels we are working with, this is all good

sort ID
gen check=1
replace check=1+check[_n-1] if ID==ID[_n-1]
bysort ID: egen max_check=max(check)
drop check
tab  max_check
sort max_check ID AGEX
tab PANEL if max_check==3 // THIS ARISES ONLY FOR PANEL 23
tab PANEL if max_check==1 // THIS ARISES ONLY FOR PANEL 23

tab PANEL
//Panels 23 and 24 were extended to nine rounds (four years) of data collection as opposed to the historical five rounds (two years)

* for now, let's just drop the extra rounds for PANEL 23 - come back to this if we end up wanting to use this data. 
drop max_check
sort ID YEAR
gen check=1
replace check=1+check[_n-1] if ID==ID[_n-1]
tab YEAR if check ==3 // 8,703 observations for year 2020
drop if check==3
*/
}


***KEEP THE VARIABLES WE NEED ***


* Build one global macro per thematic group of variables and then -keep- the union
* of all groups plus the identifiers. Names ending in * are wildcards.
* The macros are globals, so they remain defined after this block has run.
* Some names are listed more than once (INDCAT31/42/53 and OCCCAT31/42/53 appear twice
* in varkeep_emp, OFFER* and CHOIC* already cover the explicit OFFER../CHOIC.. names,
* K6SUM42 is in both varkeep_SAQhealth and varkeep_despair).
qui{
#delimit ;
global varkeep_emp EMPST EMPST31 EMPST42 EMPST53 SELFCM31 SELFCM42
	SELFCM53 SELFCM HRWG31X HRWG42X HRWG53X HRWG1X HRWG2X HRWGX NHRWG31
	NHRWG42 NHRWG53 HELD1X HELD2X HELDX HELD31X HELD42X HELD53X HOUR31
	HOUR42 HOUR53 HOUR INDCAT31 INDCAT42 INDCAT53 OCCCAT31 OCCCAT42 OCCCAT53 COCCP31 COCCP42 COCCP53
	STJBDD STJBMM STJBYY STJBDD31 STJBMM31 STJBYY31 STJBDD42 STJBMM42
	STJBYY42 STJBDD53 STJBMM53 STJBYY53 NWK NWK31 NWK42 NWK53 MORJOB
	MORJOB31 MORJOB42 MORJOB53
	TEMPJB31 TEMPJB42 TEMPJB53
	OFFER* OFFER31X  OFFER42X OFFER53X
	CHOIC* CHOIC31 CHOIC42 CHOIC53
	INDCAT31 INDCAT42 INDCAT53
	CIND31 CIND42 CIND53
	OCCCAT31 OCCCAT42  OCCCAT53
	UNION31 UNION42  UNION53
	PRIVAT31 PRIVAT42 PRIVAT53
	PRING31 PRING42 PRING53
	PRIV31 PRIV42 PRIV53
	PRIEU31 PRIEU42 PRIEU53
	PRIS31 PRIS42 PRIS53
	PRIOG31 PRIOG42 PRIOG53
	PRIDK31 PRIDK42 PRIDK53
	PROUT31 PROUT42 PROUT53
	PRSTX31 PRSTX42 PRSTX53 
	PUB31X PUB42X PUB53X
	MCAID31X  MCAID42X MCAID53X
	MCARE31X  MCARE42X MCARE53X
	DDNWRK31 DDNWRK42 DDNWRK53
	NUMEMP31 NUMEMP42 NUMEMP53;
global varkeep_SAQhealth ADSMOK42 ADNSMK42 ADDSMK42 ADGENH42 PCS42 MCS42 K6SUM42 
	EQU42 ADCMPM42 ADCMPD42 ADCMPY42;	
global varkeep_preventive_care DENTCHK* BLDPCHK* CHOLCHK* PHYSICL* FLUSHOT* WEARDEN* LOSTEET* BPCHEK* 
	 ASPRIN* NOASPR* STOMCH* BPMONT* NOFAT* EXRCIS* DENTCK* 
     BLDPCK* PHYSCL* WRDENT* LOSTEE* PROSEX* HGHTFT* HGHTIN* WEIGHT* WGTEST*
	 CHOLCK* CHECK* FLUSHT* LSTETH* PSA* HYSTER* STOOL* WHENST* BOWEL* WHNBWL* 
	 PHYACT* BMINDX* SEATBE* ;	 
global varkeep_round_dates BEGRFD31 BEGRFM31 BEGRFY31 ENDRFD31 ENDRFM31 ENDRFY31 BEGRFD42 
		BEGRFM42 BEGRFY42 ENDRFD42 ENDRFM42 ENDRFY42 BEGRFD53 BEGRFM53 BEGRFY53 ENDRFD ENDRFM ENDRFY INSCOP31 INSCOP42 INSCOP53;		
global varkeep_limitations IADLHP* ADLHLP* AIDHLP* WLKLIM* LFTDIF* STPDIF* WLKDIF* MILDIF* 
		STNDIF* BENDIF* RCHDIF* FNGRDF* ACTLIM* WRKLIM* HSELIM* SCHLIM* UNABLE* SOCLIM* COGLIM*;		
global varkeep_health RTHLTH* MNHLTH* MDUNAB42 DNUNAB42 PMUNAB42 MDDLAY42 DNDLAY42 PMDLAY42 MDUNRS42 DNUNRS42 PMUNRS42 MDDLRS42 DNDLRS42 PMDLRS42 ACCELI42 ;
global varkeep_despair  K6SUM42 ADNERV42 ADHOPE42  ADREST42  ADSAD42  ADEFRT42  ADWRTH42  ADINTR42 ADDPRS42  ADCAPE42  ADNRGY42  ADDOWN42;



global varkeep_income TTLP* WAG* SSIDIS* PUBP* FAMINC DIVDP* INTRP* TRSTP*;
global varkeep_demographics SEX DOBMM DOBYY RACE* MARRY* EDUC* REGION* FAMSZE* EDRECODE* ;

global varkeep_health_other   CHOLDX* HIBPDX* BPMLDX* CHDD* ANGID* MID* OHRTD* EMPHD* DIABD* ASTHDX* ARTHDX* SRTHRT* ASSTIL* ASATAK* OHRTDX* STRKDX* JTPAIN* ARTHTX* ;

global varkeep_health_HCQ ADRISK* ADINSA* ADINSB* ADOVER*  ADILCR* ADILWW* ADRTCR* ADRTWW* ADAPPT* ADNDCR* ADNECP* ADLIST* ADEXPL* ADRESP* ADPRTM* ADHECR* ADSPEC* ADPRRE* ADGENH* ADDAYA* ADPALS* ADPWLM* ADMALS* ADPAIN* ;

//ADUTRM* ADSOCA*

global varkeep_health_exp TOTEXP TOTSLF INSCOV* TOTTCH* RXEXP* PYUNBL* CRFMPY* PROBPY*;

global varkeep_cancer CANCERDX* CABLADDR* CABRAIN* CABREAST* CACERVIX* CACOLON* CALEUKEM* CALUNG* CALYMPH* CAMELANO* CAOTHER* CAPROSTA* CASKINDK* CASKINNM* CATHROAT* CATHYROD* CERVAGED*;

/* identifiers and the sampling weight first, then every thematic group defined above */
keep PANEL YEAR DUID PID ID  WEIGHT SPOUID
	 $varkeep_income $varkeep_demographics $varkeep_health $varkeep_limitations
	 $varkeep_round_dates $varkeep_preventive_care $varkeep_emp $varkeep_SAQhealth $varkeep_health_other $varkeep_health_HCQ
	 $varkeep_health_exp $varkeep_despair $varkeep_cancer;
#delimit cr

}


* the sampling weight gets a descriptive name
rename WEIGHT Sample_Weight


*** PREPARE VARIABLES FOR RESHAPING. 
qui{
***TRANSFORMING THE DATASET SUCH THAT WE HAVE ONLY ONE OBSERVATION PER ID
order YEAR PANEL DUID PID ID SPOUID

sort ID YEAR
tab PANEL

* YR_1_2 = position of the calendar year inside the panel:
* YEAR-PANEL equal to 1995 is coded 1 (first year), 1996 is coded 2 (second year);
* any other difference is left as it is.
gen YR_1_2=YEAR-PANEL
recode YR_1_2 (1995=1) (1996=2)

* VARIABLES AT ANNUAL LEVEL
* Each annual measure X becomes X_Y1 / X_Y2: the value observed in panel year 1 / 2,
* copied to every row of the same ID. Relies on the data being sorted by ID.
qui{
	
local annual_stubs "TOTEXP TOTSLF RXEXP INSCOV TOTTCH CANCERDX CABLADDR CABRAIN CABREAST CACERVIX CACOLON CALEUKEM CALUNG CALYMPH CAMELANO CAOTHER CAPROSTA CASKINDK CASKINNM CATHROAT CATHYROD CERVAGED "


foreach stub of local annual_stubs { 
	forvalues yr = 1/2 {
		* rows of the other panel year contribute missing, which max() ignores
		by ID: egen `stub'_Y`yr'=max(cond(YR_1_2==`yr', `stub', .))
	}
	drop `stub'
}

}

 
* variables that are round specific  - rounds 2 and 4
qui{ 
	
* stems of the variables that exist only with the suffix 42
local saq_stubs "MDUNAB DNUNAB PMUNAB MDDLAY DNDLAY PMDLAY   MDDLRS DNDLRS PMDLRS ACCELI  MDUNRS DNUNRS PMUNRS   K6SUM ADNERV ADHOPE  ADREST  ADSAD  ADEFRT  ADWRTH  ADINTR ADDPRS  ADCAPE  ADNRGY  ADDOWN ADSMOK ADRISK ADINSA ADINSB ADOVER  ADILCR ADILWW ADRTCR ADRTWW ADAPPT ADNDCR ADNECP ADLIST ADEXPL ADRESP ADPRTM ADHECR ADSPEC ADPRRE ADGENH ADDAYA ADPALS ADPWLM ADMALS ADPAIN  PCS MCS EQU"
//ADUTRM ADSOCA

/*
* SAQ General Health Questions: All available only in Rounds 2 and 4, and only since 2000

* ADSMOK42: Currently Smoke
* -9 Not Ascertained
* -7 Refused (Very few)
* -1 Inapplicable
*  1 Yes
*  2 No


* PCS42: Physical Component Summary SF12
* MCS42: Mental Component Summary SF12
* -9 Not Ascertained
* -1 Inapplicable
* Continuous variable: positive number 0 to 100.



* EQU42: EQ-5D Preference Based Index (available only 2000, 2001, 2002, 2003)
* -9 Not Ascertained
* -1 Inapplicable
* Continuous variable: positive number 0.0 to 1.0.



*/

sort ID YEAR
foreach stub of local saq_stubs {
	* <stub>2 = the "42" value of panel year 1, <stub>4 = the "42" value of panel year 2,
	* each spread over all rows of the ID
	by ID: egen `stub'2=max(cond(YR_1_2==1, `stub'42, .))
	by ID: egen `stub'4=max(cond(YR_1_2==2, `stub'42, .))
	drop `stub'42
	}
	sort ID YEAR
	
	

}



* variables that are round specific  - every round 1-5
qui{
	

/* EMP variables

//local variables_family "FAMSZE INSCOP"  //fam size, in scope
//local variables_emp "EMPST SELFCM NUMEMP DDNWRK HOUR STJBDD STJBMM STJBYY NWK MORJOB TEMPJB" //EMPLOYMENT VARIABLES (EMP, Self emp, firm size, days missed due to health problems, hours, 3 vars for job start date, reason not working, has more than one job , if temporary job, )


* EMPST:
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-1 Inapplicable
* 1 Employed at RD [.] Interview Date
* 2 Job to Return to at RD [.] Interview Date
* 3 Job During RD [.] Ref Period
* 4 Not Employed During RD [.]


* Self-Employed
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-1 Inapplicable
* 1 YES
* 2 NO

* Hours
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-2 Determined in Previous Round (not in HOUR31)
*-1 Inapplicable


* Current Job Start Date
//order STJBDD STJBMM STJBYY STJBDD31 STJBMM31 STJBYY31 STJBDD42 STJBMM42 STJBYY42 STJBDD53 STJBMM53 STJBYY53
* STJBDD (dates) not available in 2008 for confidentiality reasons
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-2 Determined in Previous Round (not in STJB__31)
*-1 Inapplicable

* Reason Not Working During ...
//order NWK NWK31 NWK42 NWK53
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-1 Inapplicable
* 1 Could not find work
* 2 Retired
* 3 Unable to work because ill/disabled
* 4 On temporary layoff
* 5 Maternity/paternity leave
* 6 Going to school
* 7 Taking care of home or family
* 8 Wanted some time off
* 9 Waiting to start new job
*91 Other


* Has more than one job
//order MORJOB MORJOB31 MORJOB42 MORJOB53
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-1 Inapplicable
* 1 More than one current job
* 2 Not more than one current job
*/

* Harmonise variable names before the round loop: strip the trailing "X" so that every
* round-specific variable has the form <stem>31 / <stem>42 / <stem>53, and copy the
* differently named variables into the first-year rows of panel 1.
* NOTE(review): -renvars- is presumably the community-contributed package, not an
* official Stata command - confirm it is installed. postdrop(1) removes the last
* character of every matched name.
renvars HELD*, postdrop(1)
* first-year rows of panel 1: take rounds 1 and 2 from HELD1 / HELD2
replace HELD31= HELD1 if YR_1_2==1 & PANEL==1 
replace HELD42= HELD2 if YR_1_2==1 & PANEL==1
drop HELD1 HELD2

renvars OFFER*X, postdrop(1)
replace OFFER31=OFFER1 if YR_1_2==1 & PANEL==1 
replace OFFER42=OFFER2 if YR_1_2==1 & PANEL==1 
drop OFFER1 OFFER2

renvars MARRY*X, postdrop(1)
replace MARRY31=MARRY1 if YR_1_2==1 & PANEL==1 
replace MARRY42=MARRY2 if YR_1_2==1 & PANEL==1 
drop MARRY1 MARRY2 MARRY

renvars MCAID*, postdrop(1)
renvars MCARE*, postdrop(1)
* NOTE(review): PUB* matches every name starting with PUB, so besides PUB31X/42X/53X it
* also shortens whatever was kept through the PUBP* pattern. The income section reads a
* variable called PUBP, which presumably only exists because of this rename - verify
* before narrowing the pattern.
renvars PUB*, postdrop(1)

* first-year rows of panel 1: round 1 value comes from CHOIC
replace CHOIC31=CHOIC if YR_1_2==1 & PANEL==1
drop CHOIC

/* INS variables

//local variables_ins "HELD OFFER MCAID MCARE PUB CHOIC PRIVAT PRING PRIV PRIEU PRIS  PRIOG PRIDK PROUT PRSTX"

* HELD1X HELD2X HELDX HELD31X HELD42X HELD53X
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-3 No Data In Round (1996 only. I don't know what this means. )
*-1 Inapplicable
* 1 YES
* 2 NO

* Health Insurance
//order OFFER31X OFFER42X OFFER53X  MCAID31X MCAID42X MCAID53X  MCARE31X MCARE42X MCARE53X PUB31X PUB42X PUB53X
*-9 Not Ascertained
*-8 DK
*-7 Refused
*-1 Inapplicable
* 1 YES
* 2 NO

* PRIVATE INSURANCE
* ANY TIME COVERED BY PRIVATE INSURANCE IN ROUND
* -1 INAP
* 1 YES
* 2 NO
*/


local round_stubs "FAMSZE INSCOP MARRY EMPST SELFCM NUMEMP DDNWRK HOUR STJBDD STJBMM STJBYY NWK MORJOB TEMPJB HELD OFFER MCAID MCARE PUB CHOIC PRIVAT PRING PRIV PRIEU PRIS  PRIOG PRIDK PROUT PRSTX"  


* For every stem build <stem>1 ... <stem>5 (one column per round), constant within ID:
*   round 1 = suffix 31 in panel year 1        round 2 = suffix 42 in panel year 1
*   round 3 = suffix 31 in panel year 2, or suffix 53 in panel year 1 when PANEL!=1
*   round 4 = suffix 42 in panel year 2        round 5 = suffix 53 in panel year 2
sort ID YEAR
foreach stub of local round_stubs {
	by ID: egen `stub'1=max(cond(YR_1_2==1, `stub'31, .))
	by ID: egen `stub'2=max(cond(YR_1_2==1, `stub'42, .))
	by ID: egen `stub'3=max(cond(YR_1_2==2, `stub'31, cond(YR_1_2==1 & PANEL!=1, `stub'53, .)))
	by ID: egen `stub'4=max(cond(YR_1_2==2, `stub'42, .))
	by ID: egen `stub'5=max(cond(YR_1_2==2, `stub'53, .))

	// -2 = determined in previous round: copy the preceding round, walking forward
	// so that a value can travel through several consecutive -2 codes
	forvalues r = 2/5 {
		local prev = `r' - 1
		replace `stub'`r'=`stub'`prev' if `stub'`r'==-2 & `stub'`prev'!=.
	}

	drop `stub'31 `stub'42 `stub'53
}
sort ID YEAR

}



* SPECIAL CASES
* Hourly Wage
qui{
order HRWG31X HRWG42X HRWG53X HRWG1X HRWG2X HRWGX NHRWG31 NHRWG42 NHRWG53
* HRWG31X HRWG42X HRWG53X HRWG1X HRWG2X HRWGX *-10 Top-Coded ($72.12, 2008; no top-coding in 1996)
*-9 Not Ascertained
*-3 No Data In Round (1996 only. I don't know what this means. )
*-2 Determined in the Previous Round (Not in 1X and 31X)
*-1 Inapplicable

* NHRWG31 NHRWG42 NHRWG53: Updated Hourly Wage (available only in and after 2004)
*-13 Initial Wage Imputed
*-10 Updated Hourly Wage Top-Coded ($72.12, 2008; )
* -9 Not Ascertained
* -1 Inapplicable

* step 1: row-level wage per round from the HRWG..X variables
gen HRWG1=HRWG31X if YR_1_2==1
gen HRWG2=HRWG42X if YR_1_2==1
gen HRWG3=HRWG31X if YR_1_2==2
replace HRWG3=HRWG53X if YR_1_2==1 & PANEL!=1 
gen HRWG4=HRWG42X if YR_1_2==2 
gen HRWG5=HRWG53X if YR_1_2==2 
* first-year rows of panel 1 take rounds 1 and 2 from HRWG1X / HRWG2X
replace HRWG1=HRWG1X if YR_1_2==1 & PANEL==1 
replace HRWG2=HRWG2X if YR_1_2==1 & PANEL==1

* step 2: an updated wage (NHRWG..) that is positive or top-coded (-10) overrides step 1.
* Year 1 feeds rounds 1-3, year 2 feeds rounds 3-5 (suffixes 31, 42, 53 in that order).
forvalues yr = 1/2 {
	local rnd = 2*(`yr'-1)
	foreach sfx in 31 42 53 {
		local ++rnd
		replace HRWG`rnd'=NHRWG`sfx' if YR_1_2==`yr' & NHRWG`sfx'!=. & (NHRWG`sfx'==-10 | NHRWG`sfx'>0) 
	}
}

* step 3: spread each round's value over all rows of the ID
sort ID YEAR
forvalues r = 1/5 {
	by ID: egen HRWG`r'_max=max(HRWG`r')
	drop HRWG`r'
	rename HRWG`r'_max HRWG`r'
}
sort ID YEAR
order HRWG1 HRWG2 HRWG3 HRWG4 HRWG5
drop HRWG31X HRWG42X HRWG53X HRWG1X HRWG2X NHRWG31 NHRWG42 NHRWG53

* step 4: -2 (determined in previous round) is replaced by the immediately preceding
* round when that round holds a positive or top-coded wage. Looking further back than
* one round (e.g. filling HRWG3 from HRWG1) was drafted in earlier versions of this
* file but is deliberately not applied.
forvalues r = 2/5 {
	local prev = `r' - 1
	replace HRWG`r'=HRWG`prev' if HRWG`r'==-2 & HRWG`prev'!=. & (HRWG`prev'==-10 | HRWG`prev'>0.0)
}
}


* DIFFICULTY PAYING BILLS
qui{
order PYUNBL* CRFMPY* PROBPY*
sort ID YEAR

* Suffix 2 = the "42" value of panel year 1, suffix 4 = the "42" value of panel year 2,
* each spread over all rows of the ID.
* NOTE(review): the year-1 CRFMPY column is called CRFMPYL2 (with an extra L), unlike
* its siblings PYUNBL2 / PROBPY2 / CRFMPY4. The name is kept as is because later code
* may refer to it; rename to CRFMPY2 once all uses have been checked.
by ID: egen PYUNBL2=max(cond(YR_1_2==1, PYUNBL42, .))
by ID: egen PYUNBL4=max(cond(YR_1_2==2, PYUNBL42, .))
by ID: egen CRFMPYL2=max(cond(YR_1_2==1, CRFMPY42, .))
by ID: egen CRFMPY4=max(cond(YR_1_2==2, CRFMPY42, .))
by ID: egen PROBPY2=max(cond(YR_1_2==1, PROBPY42, .))
by ID: egen PROBPY4=max(cond(YR_1_2==2, PROBPY42, .))

drop PYUNBL42 CRFMPY42 PROBPY42
}


***PREVENTIVE CARE VARIABLES***  
qui{
* move every variable matching the preventive-care patterns to the front of the dataset
order  BPCHEK* ASPRIN* NOASPR* STOMCH* BPMONT* NOFAT* EXRCIS* DENTCK* BLDPCK* PHYSCL* WRDENT* LOSTEE* PROSEX* HGHTFT* HGHTIN* WEIGHT* WGTEST* CHOLCK* CHECK* FLUSHT* LSTETH* PSA* HYSTER* STOOL* WHENST* BOWEL* WHNBWL* PHYACT* BMINDX* SEATBE*

*information on these variables is collected in rounds 3 and 5 (therefore, these variables always end in 53)
*these variables are present in  1996, 1998 and 2000 and after

*  in 1996, these variables are named a bit differently:
*DENTCHK3 frequency of dental check-ups
*BLDPCHK3 time since last having blood pressure taken by a doctor, nurse, or other health professional
*CHOLCHK3 time since last checking cholesterol level
*PHYSICL3 time since last complete physical
*FLUSHOT3 time since last flu shot
*WEARDEN3 does person wear dentures
*LOSTEET3 has person lost all adult teeth
*PROSEXA3 time since last prostate exam 


*  in 1998, these variables are also named a bit differently:
*DENTCK98 = frequency of dental check-ups
*BLDPCK98 =time since last having blood pressure taken by a doctor, nurse, or other health professional
*CHOLCK98 = time since last checking cholesterol level
*PHYSCL98 = time since last complete physical
*FLUSHT98 = time since last flu shot
*WRDENT98 = does person wear dentures (Age > 34; both genders)
*LOSTEE98 = has person lost all adult teeth (Age > 34; both genders)
*PROSEX98 = time since last prostate exam

*In years after 2000, these variables are named:
*DENTCK53 - on average, frequency of dental check-up
*CHOLCK53 - about how long since last blood cholesterol check by doctor or health professional
*CHECK53 - how long since last routine check-up by doctor or other health professional for assessing overall health
*FLUSHT53 - how long since last flu shot
*LSTETH53 - has person lost all natural (permanent) teeth
*PSA53 - how long since last prostate specific antigen (PSA) test

* more variables are introduced in 2007

*Fix the fact that in 1996 and 1998 variable names are different:
*DENTCHK3 and DENTCK98 become DENTCK53
*BLDPCHK3 become BLDPCK
*CHOLCHK3 and CHOLCK98 become CHOLCK53
*PHYSICL3 and PHYSCL98 become CHECK53
*FLUSHOT3 and FLUSHT98 become FLUSHT53
*WEARDEN3 become  WRDENT
*LOSTEET3 and LOSTEE98 become LSTETH53
*PROSEXA3 and PROSEX98 become PSA53

* Fill the "53" variables with the differently named 1998 and 1996 versions, but only
* where the "53" variable is still missing.
replace DENTCK53=DENTCK if YEAR==1998 & DENTCK53==.
replace DENTCK53=DENTCHK3 if YEAR==1996 & DENTCK53==.

* BLDPCK and WRDENT are overwritten for every 1996 row and renamed to the "53" form below
replace BLDPCK=BLDPCHK3 if YEAR==1996

replace CHOLCK53=CHOLCK if YEAR==1998 & CHOLCK53==.
replace CHOLCK53=CHOLCHK3 if YEAR==1996 & CHOLCK53==.

replace CHECK53=PHYSCL if YEAR==1998 & CHECK53==.
replace CHECK53=PHYSICL3 if YEAR==1996 & CHECK53==.

replace FLUSHT53=FLUSHT if YEAR==1998 & FLUSHT53==.
replace FLUSHT53=FLUSHOT3 if YEAR==1996 & FLUSHT53==.

replace WRDENT=WEARDEN3 if YEAR==1996

replace LSTETH53=LOSTEE if YEAR==1998 & LSTETH53==.
replace LSTETH53=LOSTEET3 if YEAR==1996 & LSTETH53==.

replace PSA53=PROSEX if YEAR==1998 & PSA53==.
replace PSA53=PROSEXA3 if YEAR==1996 & PSA53==.

* the old-name variables are no longer needed once copied
drop DENTCK DENTCHK3 BLDPCHK3 CHOLCK CHOLCHK3 PHYSCL PHYSICL3 FLUSHT FLUSHOT3 WEARDEN3 LOSTEE LOSTEET3 PROSEX PROSEXA3
drop PHYSCL42
*(this var was only for kids under 17)

rename BLDPCK BLDPCK53
rename WRDENT WRDENT53

sort ID YEAR
* <stem>3 = the "53" value of panel year 1, <stem>5 = the "53" value of panel year 2
local variables "BPCHEK ASPRIN NOASPR STOMCH BPMONT NOFAT EXRCIS DENTCK BLDPCK  WRDENT HGHTFT HGHTIN WEIGHT WGTEST CHOLCK CHECK FLUSHT LSTETH PSA HYSTER STOOL WHENST BOWEL WHNBWL PHYACT BMINDX SEATBE"
foreach x of local variables {
gen `x'3=`x'53 if YR_1_2==1
gen `x'5=`x'53 if YR_1_2==2
drop `x'53
}

sort ID YEAR 
* The -order- makes the new columns contiguous so that the range BPCHEK3 - SEATBE5 below
* covers them. Every variable matched by these wildcards ends up inside that range and
* is replaced by its per-ID maximum, not only the columns created in the loop above.
order BPCHEK* ASPRIN* NOASPR* STOMCH* BPMONT* NOFAT* EXRCIS* DENTCK* BLDPCK*  WRDENT* HGHTFT* HGHTIN* WEIGHT* WGTEST* CHOLCK* CHECK* FLUSHT* LSTETH* PSA* HYSTER* STOOL* WHENST* BOWEL* WHNBWL* PHYACT* BMINDX* SEATBE*
foreach var of varlist (BPCHEK3 - SEATBE5){       
by ID: egen `var'_max=max(`var')
drop `var'
rename `var'_max `var'
}
   
}

*** INCOME VARIABLES ***
qui{
* income and tax-related variables were constructed primarily from data collected in the Panel N Round 5 and Panel N+1 Round 3 Income Sections.
* so these variables are not for every round. They should be annual. (it doesn't say so in the documentation explicitly though, but they look like it.)

* move the income variables to the front; DIV*, INT* and TRS* match every name with
* those prefixes, which is broader than the DIVDP* / INTRP* / TRSTP* patterns used in -keep-
order TTLP* WAG* SSIDIS* PUBP* FAMINC DIV* INT* TRS*

*the variable WAGEPNX was constructed, and imputations were made as follows: 
*   WAGEIMP=1 means the response was not edited
*   WAGEIMP=2 means respondents provided broad income ranges rather than giving specific dollar amounts.  
*             They used weighted sequential hot-decking to provide these individuals with specific dollar amounts
*   WAGEIMP=3 means person did not report wage and salary income and were assigned WAGEPNX=0 based on 
*             either being under 16 or not having been employed during the year
*   WAGEIMP=4 means person did not provide valid dollar amounts or dollar ranges, 
*             but for whom they had information from the employment sections of Rounds 1, 2,
*             and 3 concerning wages, hours, and weeks worked (in all jobs).   They used these data to construct
*             annualized wage amounts in place of missing annual wage and salary data.  
*             NOTE: part-year responders were assumed to be fully-employed during the remainder of the year if they were 
*			        employed during the period in which they provided data – the exception being those who left due to death or 
*			        institutionalization.  These persons were assigned zero wages and salaries for the time they were not in MEPS.
*   WAGEIMP=5 means persons with missing WAGEPNX who were deemed to have been employed and were hot-decked 
*             in conditional imputations that used only donors with positive WAGEPNX amounts
*   WAGEIMP=6 means persons with missing WAGEPNX who were hot-decked
*             WAGEPNX in an unconditional imputation that used both workers and nonworkers as donors

* TO SUMMARIZE: 
*   1=Original response used; 
*   2=Bracket converted; 
*   3=Missing value set to 0; 
*   4=Weeks worked/earnings used(WAGEIMP only); 
*   5=Conditional hotdeck; 
*   6=Unconditional hotdeck

* in 1996 rows, fall back on WAGPNX where WAGPX is missing; WAGPNX is not needed afterwards
replace WAGPX=WAGPNX if WAGPX==. & YEAR==1996 & WAGPNX!=.  
drop WAGPNX

* The wage is annual, so it is split into one column per panel year:
* WAGE_Y1 / WAGE_Y2 hold WAGPX of the first / second year in which the person was
* interviewed, and WAGEIMP_Y1 / WAGEIMP_Y2 hold the matching imputation flag WAGIMP.
* At this stage each column is filled only in the row of its own year.
forvalues yr = 1/2 {
	gen WAGE_Y`yr'=WAGPX if YR_1_2==`yr'
}

forvalues yr = 1/2 {
	gen WAGEIMP_Y`yr'=WAGIMP if YR_1_2==`yr'
}

*also, similar for total income:
* NOTE:
*TOTAL INCOME
* Total person-level income (TTLPNX) is the sum of all income components with the exception of
* REFDPNX and SALEPNX (so that we are following as closely as possible the CPS definition of income).  
* The Documentation notes that "Some researchers may wish to define their own income measure by adding in one or
* both of these excluded components."

*** TOP CODING**
*"All income amounts on the file, including both total income and the separate sources of income,
*Were top coded to preserve confidentiality.  For each income source, top codes were applied to the
*top percentile of all cases (including negative amounts that exceeded income thresholds in
*absolute value).  In cases where fewer than one percent of all persons received a particular income
*source, we top-coded all recipients. "
* in 1996 rows, fall back on TTLPNX where TTLPX is missing
replace TTLPX=TTLPNX if TTLPX==. & YEAR==1996 & TTLPNX!=.  
drop TTLPNX

* one column per panel year for total person income and for family income
forvalues yr = 1/2 {
	gen TOTALINC_Y`yr'=TTLPX if YR_1_2==`yr'
}

forvalues yr = 1/2 {
	gen FAMINC_Y`yr'=FAMINC if YR_1_2==`yr'
}

drop WAGIMP WAGPX TTLPX FAMINC

* spread every year-specific column over all rows of the ID
sort ID YEAR 
foreach stub in WAGE WAGEIMP TOTALINC FAMINC {
	forvalues yr = 1/2 {
		by ID: egen `stub'_Y`yr'_max=max(`stub'_Y`yr')
		drop `stub'_Y`yr'
		rename `stub'_Y`yr'_max `stub'_Y`yr'
	}
}
sort ID YEAR 


* income from interest, dividends, and rent/trusts - we may want to subtract this from total income 
* in 1996 rows, fall back on the ..PNX variable where the ..PX variable is missing
foreach src in DIVDP INTRP TRSTP {
	replace `src'X=`src'NX if `src'X==. & YEAR==1996 & `src'NX!=.  
	drop `src'NX
}

* one column per panel year: dividends (DIVINC), interest (INTINC), trusts/rent (TRINC)
forvalues yr = 1/2 {
	gen DIVINC_Y`yr'=DIVDPX if YR_1_2==`yr'
}
forvalues yr = 1/2 {
	gen INTINC_Y`yr'=INTRPX if YR_1_2==`yr'
}
forvalues yr = 1/2 {
	gen TRINC_Y`yr'=TRSTPX if YR_1_2==`yr'
}

drop DIVDPX INTRPX TRSTPX

* spread every year-specific column over all rows of the ID
sort ID YEAR 
foreach stub in DIVINC INTINC TRINC {
	forvalues yr = 1/2 {
		by ID: egen `stub'_Y`yr'_max=max(`stub'_Y`yr')
		drop `stub'_Y`yr'
		rename `stub'_Y`yr'_max `stub'_Y`yr'
	}
}
sort ID YEAR

*** SSI due to disability
*-8 DK 2 8,616
*-1 INAPPLICABLE 
*1 DISABILITY 
*2 SOME OTHER REASON


* first-year value: SSIDISAB in 1996 rows, SSIDIS in every other year
gen SSIDIS_Y1=cond(YEAR==1996, SSIDISAB, SSIDIS) if YR_1_2==1
gen SSIDIS_Y2=SSIDIS if YR_1_2==2


forvalues yr = 1/2 {
	gen PUBP_Y`yr'=PUBP if YR_1_2==`yr'
}

drop SSIDISAB SSIDIS PUBP

* spread every year-specific column over all rows of the ID
sort ID YEAR 
foreach stub in SSIDIS PUBP {
	forvalues yr = 1/2 {
		by ID: egen `stub'_Y`yr'_max=max(`stub'_Y`yr')
		drop `stub'_Y`yr'
		rename `stub'_Y`yr'_max `stub'_Y`yr'
	}
}
sort ID YEAR 
}


* industry and occup
qui{
* Industry (CIND, INDCAT), occupation (OCCCAT, COCCP) and union membership (UNION):
* build one column per round (1-5) and make it constant within ID.
*   round 1 = suffix 31 in panel year 1        round 2 = suffix 42 in panel year 1
*   round 3 = suffix 31 in panel year 2 for PANEL==1, suffix 53 in panel year 1 otherwise
*   round 4 = suffix 42 in panel year 2        round 5 = suffix 53 in panel year 2
*
* FIX: COCCP1-COCCP5 used to be generated after the -order- that defined the range
* CIND1-OCCCAT5, so they were never replaced by their per-ID maximum and stayed filled
* only in the row of their own year. They are now spread over the ID like the others.
sort ID YEAR 
foreach stub in CIND INDCAT OCCCAT COCCP {
	gen `stub'1=`stub'31 if YR_1_2==1
	gen `stub'2=`stub'42 if YR_1_2==1
	gen `stub'3=`stub'31 if YR_1_2==2 & PANEL==1
	replace `stub'3=`stub'53 if YR_1_2==1 & PANEL!=1
	gen `stub'4=`stub'42 if YR_1_2==2
	gen `stub'5=`stub'53 if YR_1_2==2

	* spread each round's value over all rows of the ID
	forvalues r = 1/5 {
		by ID: egen `stub'`r'_max=max(`stub'`r')
		drop `stub'`r'
		rename `stub'`r'_max `stub'`r'
	}
}
sort ID YEAR 
order CIND1 CIND2 CIND3 CIND4 CIND5 INDCAT1 INDCAT2 INDCAT3 INDCAT4 INDCAT5 OCCCAT1 OCCCAT2 OCCCAT3 OCCCAT4 OCCCAT5 COCCP1 COCCP2 COCCP3 COCCP4 COCCP5
drop CIND31 CIND42 CIND53  INDCAT31 INDCAT42 INDCAT53 OCCCAT31 OCCCAT42 OCCCAT53 COCCP31 COCCP42 COCCP53


* union membership
gen UNION1=UNION31 if YR_1_2==1
gen UNION2=UNION42 if YR_1_2==1
gen UNION3=UNION31 if YR_1_2==2 & PANEL==1
replace UNION3=UNION53 if YR_1_2==1 & PANEL!=1
gen UNION4=UNION42 if YR_1_2==2
gen UNION5=UNION53 if YR_1_2==2

sort ID YEAR 
forvalues r = 1/5 {
	by ID: egen UNION`r'_max=max(UNION`r')
	drop UNION`r'
	rename UNION`r'_max UNION`r'
}
sort ID YEAR 
order UNION1 UNION2 UNION3 UNION4 UNION5
drop  UNION31 UNION42 UNION53 


**** we need to fix the fact that many of these variables have a -2 in rounds 2 and after, which means the answer is the same as in the previous round
* Walks forward round by round so a value can travel through consecutive -2 codes.
* NOTE(review): only UNION, INDCAT and OCCCAT are treated (as before); CIND and COCCP
* are left untouched - confirm whether they can carry -2 as well.
foreach stub in UNION INDCAT OCCCAT {
	forvalues r = 2/5 {
		local prev = `r' - 1
		replace `stub'`r'=`stub'`prev' if `stub'`r'==-2 & `stub'`prev'!=.
	}
	sort ID YEAR

}

}

***beginning and end dates for each round
qui{
order YR_1_2 YEAR PANEL DUID PID ID

* Begin (BEGRF) and end (ENDRF) of the reference period of each round, as day (D),
* month (M) and year (Y) columns, e.g. BEGRFD1 ... ENDRFY5.
*   rounds 1 and 2: suffixes 31 and 42 of panel year 1
*   round 3       : begin = suffix 53 of panel year 1 (suffix 31 of panel year 2 for
*                   PANEL==1); end = suffix 31 of panel year 2
*   round 4       : suffix 42 of panel year 2
*   round 5       : begin = suffix 53 of panel year 2; end = un-suffixed ENDRFD/M/Y of
*                   panel year 2
foreach part in D M Y {
	foreach edge in BEG END {
		gen `edge'RF`part'1 = `edge'RF`part'31 if YR_1_2==1
		gen `edge'RF`part'2 = `edge'RF`part'42 if YR_1_2==1
		gen `edge'RF`part'4 = `edge'RF`part'42 if YR_1_2==2
	}
	gen BEGRF`part'3 = BEGRF`part'53 if YR_1_2==1
	replace BEGRF`part'3 = BEGRF`part'31 if YR_1_2==2 & PANEL==1
	gen ENDRF`part'3 = ENDRF`part'31 if YR_1_2==2
	gen BEGRF`part'5 = BEGRF`part'53 if YR_1_2==2
	gen ENDRF`part'5 = ENDRF`part' if YR_1_2==2
}

* list the 30 columns round by round (begin D M Y, then end D M Y) so that they are
* processed, and therefore stored, in that sequence
local date_cols
forvalues r = 1/5 {
	foreach edge in BEG END {
		foreach part in D M Y {
			local date_cols `date_cols' `edge'RF`part'`r'
		}
	}
}

* spread every date over all rows of the ID
sort ID YEAR 
foreach col of local date_cols {
	by ID: egen `col'_max=max(`col')
	drop `col'
	rename `col'_max `col'
}
sort ID YEAR 

*by ID: keep if ENDRFD1==BEGRFD2 & ENDRFM1==BEGRFM2 & ENDRFY1==BEGRFY2
*there are more than 4000 observations deleted.. so we can't assume that end of reference period 1 is always the beginning of 2, etc
drop  BEGRFD31 BEGRFM31 BEGRFY31 ENDRFD31 ENDRFM31 ENDRFY31 BEGRFD42 BEGRFM42 BEGRFY42 ENDRFD42 ENDRFM42 ENDRFY42 ENDRFD ENDRFM ENDRFY
drop  BEGRFD53 BEGRFM53 BEGRFY53
}


*race
qui{
drop  RACEAX RACEBX RACEWX
rename RACEX RACE
* Rows with YEAR<=2001 are recoded in one pass (each value is matched against its
* original code, the rules do not chain):
*   4 -> 2,  1 -> 3,  2 -> 3,  3 -> 7,  5 -> 1,  91 -> 8
* Codes 7 and 8 are not part of the 6-category list documented below for later years.
recode RACE (4=2) (1=3) (2 =3) (3 =7) (5=1) (91=8) if YEAR<=2001

*tab RACE if YEAR==2000 
*tab RACE if YEAR==2001 
*tab RACE if YEAR==2002 
*tab RACE if YEAR==2003
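* note that for YEAR<=2001 "asian or pacific islander" (3) is recoded to 7 and "other" (91) to 8;
* neither 7 nor 8 is a code of the 2002-onwards scheme (4 = Asian, 5 = Native Hawaiian/Pacific
* Islander, 6 = multiple races). In all analysis, i will group asian with pacific islander anyway,
* so 7 has to be grouped with 4 and 5 downstream; the intention for 91 Other was to treat it as
* "multiple races", so 8 has to be grouped with 6 downstream.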

/*
- 1996
1 AMERICAN INDIAN 
2 ALEUT, ESKIMO 
3 ASIAN OR PACIFIC ISLANDER 
4 BLACK 
5 WHITE 
91 OTHER

- 2000
1 AMERICAN INDIAN 
2 ALEUT, ESKIMO 
3 ASIAN OR PACIFIC ISLANDER 
4 BLACK


- 2001
1 AMERICAN INDIAN 
2 ALEUT, ESKIMO 
3 ASIAN OR PACIFIC ISLANDER 
4 BLACK
5 WHITE

- 2002 ONWARDS
1 WHITE - NO OTHER RACE REPORTED 30,618 233,530,031
2 BLACK - NO OTHER RACE REPORTED 6,018 35,504,007
3 AMER INDIAN/ALASKA NATIVE - NO OTH RAC 436 2,582,756
4 ASIAN - NO OTHER RACE REPORTED 1,384 11,566,792
5 NATIVE HAWAIIAN/PACIFIC ISLANDER-NO OTHR 108 884,317
6 MULTIPLE RACES REPORTED

1 WHITE - NO OTHER RACE REPORTED 23,889 246,695,427
2 BLACK - NO OTHER RACE REPORTED 7,124 40,644,939
3 AMER INDIAN/ALASKA NATIVE-NO OTHER RACE 360 2,965,867
4 ASIAN/NATV HAWAIIAN/PACFC
6 MULTIPLE RACES REPORTED

*/

*1 WHITE - NO OTHER RACE REPORTED	
*2 BLACK - NO OTHER RACE REPORTED	
*3 AMER INDIAN/ALASKA NATIVE - NO OTH RAC	
*4 ASIAN - NO OTHER RACE REPORTED	
*5 NATIVE HAWAIIAN/PACIFIC ISLANDER-NO OTHR	
*6 MULTIPLE RACES REPORTED
* drop the trailing X from the race/ethnicity variable; its categories are listed below
rename  RACETHNX  RACETHN

* earlier years:
*1 PERSON IS HISPANIC 
*2 PERSON IS BLACK/NOT HISPANIC 
*3 OTHER

*1 PERSON IS HISPANIC	
*2 PERSON IS BLACK-NO OTH RACE/NOT HISPANIC	
*3 PERSON IS ASIAN-NO OTH RACE/NOT HISPANIC	- IN EARLIER YEARS, 3 IS JUST "OTHER"
*4 OTHER RACE/NOT HISPANIC
}


*education
qui{
* Builds one person-level variable EDUC (years of schooling) from the
* year-specific raw variables, then drops the raw variables.
order  EDUCYR EDUCYR31 EDUCYR42 EDUCYR53 EDUCYEAR
* Availability of the raw variables (author notes):
*   - round-specific variables (EDUCYR31/42/53) exist only in 1996, 1997, 1998
*   - 1999-2004 (inclusive): EDUCYEAR = years of education when first entered MEPS
*   - EDUCYR means different things depending on the year:
*       1996: COMPLETED YEARS OF EDUCATION - 12/31/96
*       1997: COMPLETED YEARS OF EDUCATION - 12/31/97
*       1998: COMPLETED YEARS OF EDUCATION - 12/31/98
*       2005, 06, 07, 08: YEARS OF EDUC WHEN FIRST ENTERED MEPS
sort YEAR
*tabstat EDUCYEAR, by(YEAR)
*tabstat EDUCYR, by(YEAR)
*tabstat EDUCYR31, by(YEAR)

* 1996-98: start from the first-round value (EDUCYR31) so that EDUC is as close
* as possible to education at entry into MEPS. All later years already report
* education at entry through EDUCYEAR or EDUCYR.
sort ID
local early "inlist(YEAR, 1996, 1997, 1998)"
gen EDUC = EDUCYR31 if YR_1_2==1 & `early'
* First-year rows: fall back on later rounds, first where EDUC is missing,
* then where it still holds a negative code.
foreach bad in "EDUC==." "EDUC<0" {
    foreach src in EDUCYR42 EDUCYR53 {
        replace EDUC = `src' if `bad' & YR_1_2==1 & `early'
    }
}
sort ID YEAR
* Second-year rows that are the first row of their ID (only rounds 3, 4 and 5
* are available): take the earliest usable round.
foreach src in EDUCYR31 EDUCYR42 EDUCYR53 {
    replace EDUC = `src' if (EDUC==. | EDUC<0) & YR_1_2==2 & `early' & ID!=ID[_n-1]
}
replace EDUC = EDUCYEAR if YEAR>1998 & YEAR<2005
replace EDUC = EDUCYR if YEAR>=2005
replace EDUC = . if EDUC<0

* 2012-14: EDUC is rebuilt from the categorical EDRECODE. The category ranges
* differ between 2012 and 2013-14; each category is mapped to one of the
* values 11, 12, 13, 16.
replace EDUC = . if inlist(YEAR, 2012, 2013, 2014)
replace EDUC = 11 if (inrange(EDRECODE, 0, 12) & YEAR==2012) | (inrange(EDRECODE, 1, 2) & inlist(YEAR, 2013, 2014))
replace EDUC = 12 if EDRECODE==13 & inlist(YEAR, 2012, 2013, 2014)
replace EDUC = 13 if EDRECODE==14 & inlist(YEAR, 2012, 2013, 2014)
replace EDUC = 16 if inlist(EDRECODE, 15, 16) & inlist(YEAR, 2012, 2013, 2014)

* Collapse to one value per person: the smallest non-missing EDUC over the
* rows of that ID is copied onto every row.
sort ID YEAR   
by ID: egen EDUC_lowest=min(EDUC)
drop EDUC
rename EDUC_lowest EDUC
order EDUC
drop  EDUCYR EDUCYR31 EDUCYR42 EDUCYR53 EDUCYEAR

* Reminder (author): the education variables change in 2012-14, so the
* categorical education type has to be derived from the recoded EDUC.
}


*self reported health, mental health, and limitations
qui{
* Turns the round-coded raw variables (suffix 31, 42, 53 on each yearly row)
* into panel-round variables (suffix 1 to 5) and copies each of them onto
* every row of the same ID.
*
*RTHLTH = perceived health status
*MNHLTH = perceived mental health status
*
*IADLHP31	IADL Screener
*ADLHLP31	ADL Screener 
*AIDHLP31	Used Assistive Devices 
*WLKLIM31	Limitation In Physical Functioning
*LFTDIF31	Difficulty Lifting 10 Pounds
*STPDIF31	Difficulty Walking Up 10 Steps 
*WLKDIF31	Difficulty Walking 3 Blocks 
*MILDIF31	Difficulty Walking A Mile 
*STNDIF31	Difficulty Standing 20 Minutes
*BENDIF31	Difficulty Bending/Stooping 
*RCHDIF31	Difficulty Reaching Overhead
*FNGRDF31	Difficulty Using Fingers To Grasp 
*ACTLIM31	Any Limitation Work/Housewrk/Schl 
*WRKLIM31	Work Limitation 
*HSELIM31	Housework Limitation
*SCHLIM31	School limitations
*UNABLE31	Completely unable to do activity
*SOCLIM31	Social limitations
*COGLIM31	Cognitive limitations
*
* Only RTHLTH MNHLTH IADLHP ADLHLP are available in all rounds.
* Functional and activity limitations were measured in Rounds 3 and 5 for
* Panel N and Rounds 1 and 3 for Panel N+1.

* --- measured every round: RTHLTH MNHLTH IADLHP ADLHLP ---
sort ID YEAR
order RTHLTH*  MNHLTH* IADLHP*  ADLHLP* 
foreach stub in RTHLTH MNHLTH IADLHP ADLHLP {
    * first-year row: 31, 42, 53 are rounds 1, 2, 3
    * second-year row: 31, 42, 53 are rounds 3, 4, 5
    gen `stub'1 = `stub'31 if YR_1_2==1
    gen `stub'2 = `stub'42 if YR_1_2==1
    gen `stub'3 = cond(YR_1_2==1, `stub'53, `stub'31) if inlist(YR_1_2, 1, 2)
    gen `stub'4 = `stub'42 if YR_1_2==2
    gen `stub'5 = `stub'53 if YR_1_2==2
}
sort ID YEAR 
* Replace each round variable by its within-ID maximum, so both rows of a
* person carry all five rounds. The rebuilt variables end up last in the
* variable order, in the order looped over here.
foreach stub in RTHLTH MNHLTH IADLHP ADLHLP {
    forvalues r = 1/5 {
        by ID: egen `stub'`r'_grp = max(`stub'`r')
        drop `stub'`r'
        rename `stub'`r'_grp `stub'`r'
    }
}
foreach stub in RTHLTH MNHLTH IADLHP ADLHLP {
    drop `stub'42 `stub'31 `stub'53
}

* --- OTHER LIMITATION VARIABLES: only 31 and 53 exist, giving rounds 1, 3, 5 ---
sort ID YEAR
local limits "AIDHLP WLKLIM LFTDIF STPDIF WLKDIF MILDIF STNDIF BENDIF RCHDIF FNGRDF ACTLIM WRKLIM HSELIM SCHLIM UNABLE SOCLIM COGLIM"
foreach stub of local limits {
    gen `stub'1 = `stub'31 if YR_1_2==1
    gen `stub'3 = cond(YR_1_2==1, `stub'53, `stub'31) if inlist(YR_1_2, 1, 2)
    gen `stub'5 = `stub'53 if YR_1_2==2
}
sort ID YEAR 
foreach stub of local limits {
    foreach r in 1 3 5 {
        by ID: egen `stub'`r'_grp = max(`stub'`r')
        drop `stub'`r'
        rename `stub'`r'_grp `stub'`r'
    }
}
* raw round-coded inputs are no longer needed
foreach stub of local limits {
    drop `stub'31
}
foreach stub of local limits {
    drop `stub'53
}

}


*Risk factors: cholesterol and hypertension
qui{
* Converts the diagnosis and condition variables to panel-round variables
* (suffix 1, 3, 5) and copies them onto every row of the same ID.
*
*Cholesterol: CHOLDX (=1 yes, high chol, =2 no)
*Hypertension: HIBPDX (=1 yes, =2 no). Available starting with panel 4.
*  In 2007 and after there is only one constructed variable per year (not
*  round specific); in earlier years the question is asked in rounds 5 and 3.
*  For 2007 and after, the round 5 / round 3 value is filled with the
*  non-round-specific value. The question is "ever diagnosed with ...", so a
*  yes or no given for the year must also hold in rounds 5 and 3.

order CHOLD* HIBPDX* BPMLDX* CHDDX* ANGIDX* MIDX*  EMPHDX* DIABDX* ASTHDX* ARTHDX* SRTHRT* ASSTIL* ASATAK* OHRTDX* STRKDX* JTPAIN* ARTHTX* 

sort ID YEAR
* Diagnoses with a yearly variable plus a 53 variable: fill 53 from the yearly
* one where 53 is missing, then split 53 into round 3 (first-year row) and
* round 5 (second-year row).
foreach dx in CHOLDX HIBPDX BPMLDX CHDDX ANGIDX MIDX EMPHDX DIABDX ASTHDX ARTHDX OHRTDX STRKDX {
    replace `dx'53 = `dx' if `dx'!=. & `dx'53==.
    drop `dx'
    gen `dx'3 = `dx'53 if YR_1_2==1
    gen `dx'5 = `dx'53 if YR_1_2==2
    drop `dx'53
}

* only 53: SRTHRT ARTHTX
foreach dx in SRTHRT ARTHTX {
    gen `dx'3 = `dx'53 if YR_1_2==1
    gen `dx'5 = `dx'53 if YR_1_2==2
    drop `dx'53
}

* different: JTPAIN has alternative versions (_M, _M18) that are used to fill
* missing values of the main 31 and 53 variables before being dropped.
replace JTPAIN53 = JTPAIN53_M18 if JTPAIN53 ==. & JTPAIN53_M18!=.
replace JTPAIN31 = JTPAIN31_M if JTPAIN31==. & JTPAIN31_M!=. 
replace JTPAIN31 = JTPAIN31_M18 if JTPAIN31==. & JTPAIN31_M18!=. 
drop JTPAIN31_M JTPAIN31_M18 JTPAIN53_M18

* 53 and 31: ASSTIL ASATAK JTPAIN  (rounds 1, 3 and 5)
foreach dx in ASSTIL ASATAK JTPAIN {
    gen `dx'1 = `dx'31 if YR_1_2==1
    gen `dx'3 = cond(YR_1_2==1, `dx'53, `dx'31) if inlist(YR_1_2, 1, 2)
    gen `dx'5 = `dx'53 if YR_1_2==2
    drop `dx'53 `dx'31
}

sort ID YEAR 
* The order statement makes CHOLDX3 ... ARTHTX5 a contiguous range, which the
* loop below (and later cleaning code) relies on.
order CHOLDX* HIBPDX* BPMLDX* CHDDX* ANGIDX* MIDX*  EMPHDX* DIABDX* ASTHDX* ARTHDX* SRTHRT* ASSTIL* ASATAK* OHRTDX* STRKDX* JTPAIN* ARTHTX* 
* Replace each variable in the range by its within-ID maximum.
foreach v of varlist CHOLDX3-ARTHTX5 {
    by ID: egen `v'_grp = max(`v')
    drop `v'
    rename `v'_grp `v'
}
*
sort ID YEAR
}


* Identifiers and core demographics go first in the variable order.
order  ID DUID PID YEAR PANEL YR_1_2 DOBMM DOBYY RACE RACETHN EDUC  

**drop what we forgot to drop before 
* One captured drop command for all leftovers.
* NOTE(review): if any listed name cannot be resolved, drop is expected to
* fail as a whole and capture hides the error - confirm that all names exist.
capture drop EMPST SELFCM HRWGX HELD HOUR ///
    STJBYY53 STJBMM53 STJBDD53 ///
    STJBYY42 STJBMM42 STJBDD42 ///
    STJBYY31 STJBMM31 STJBDD31 ///
    NWK MORJOB
}


*** At this point each individual has (up to) two rows whose constructed
*   variables are identical; they differ in YEAR and YR_1_2.
* Keep only the first row of every ID after sorting by PANEL, ID and YR_1_2.
* An ID with a single row (person interviewed in only one year) keeps that row.
sort PANEL ID YR_1_2
keep if ID != ID[_n-1]
sort ID
compress



* DATA CLEANING 
qui{

* Creating an Age variable (age at the beginning of the Round)
qui{
* Only the month (DOBMM) and year (DOBYY) of birth are known, so the age at
* the start of a round is approximated:
*   AGE = round start year - birth year, minus 1 when the round starts in a
*   month before the birth month. If the round starts in the birth month the
*   person is assumed to have already had the birthday.
* Non-positive round start years/months are treated as missing first.
*
* Fix: a non-positive DOBYY (negative codes are used for missing information
* in the raw data, see the recodes elsewhere in this file) used to produce an
* age of roughly 2000, which the AGE<0 check did not catch. AGE is now set to
* missing in that case.
* NOTE(review): a negative or missing DOBMM is still compared as is, so the
* age can be off by one for such cases - confirm whether DOBMM is clean here.
forvalues r = 1/5 {
    replace BEGRFY`r' = . if BEGRFY`r' <= 0
    replace BEGRFM`r' = . if BEGRFM`r' <= 0
    * (BEGRFM < DOBMM) is 1 when the birthday has not yet occurred, else 0
    gen AGE`r' = BEGRFY`r' - DOBYY - (BEGRFM`r' < DOBMM)
    replace AGE`r' = . if AGE`r' < 0 | DOBYY <= 0
}
}

* Self reported health and mental health
qui{
* Non-positive codes become missing; the five-point scale gets value labels.
* The same label set is attached to perceived health (RTHLTH) and perceived
* mental health (MNHLTH) in all five rounds.
label define health 1 "Excellent" 2 "Very Good" 3 "Good" 4 "Fair" 5 "Poor"
foreach stub in RTHLTH MNHLTH {
    forvalues r = 1/5 {
        replace `stub'`r' = . if `stub'`r' <= 0
        label values `stub'`r' health
    }
}
}

*** identifying spouses
qui{
* Attaches spouse characteristics to each married person. After sorting,
* the two spouses of a couple are expected on adjacent rows; a neighbouring
* row is accepted as the spouse when it is in the same panel and dwelling
* unit and its SPOUID equals this row's PID.
*
* Fix: PANEL is now part of the sort key and of the match condition. The
* file builds ID from PANEL because person identifiers are not unique across
* panels; without PANEL, same-numbered households from different panels were
* interleaved by the sort and could be matched to each other or could sit
* between two true spouses.
*
* Creates: temp_var (1 if SPOUID is non-missing, else 0), spouse_educ,
*   spouse_age1-5, spouse_health1-5, and for years 1 and 2 spouse_WAGE_YR,
*   spouse_income_YR, spouse_int_income_YR, int_income_YR, spouse_TOTSLF_YR,
*   spouse_TOTEXP_YR, spouse_TOTTCH_YR, spouse_RXEXP_YR.
sort DUID ID
order DUID PID SPOUID MARRY1 MARRY2 MARRY3 MARRY4 MARRY5

* SPOUID above 900 or negative is treated as no usable spouse identifier
replace SPOUID=. if SPOUID>900 | SPOUID<0

gen temp_var=0 if SPOUID==.
replace temp_var=1 if SPOUID!=.

order DUID PID SPOUID MARRY1 MARRY2 MARRY3 MARRY4 MARRY5
sort temp_var PANEL YR_1_2 DUID PID 

* neighbour-is-spouse conditions (previous row / next row)
local prev "PANEL==PANEL[_n-1] & DUID==DUID[_n-1] & PID==SPOUID[_n-1]"
local next "PANEL==PANEL[_n+1] & DUID==DUID[_n+1] & PID==SPOUID[_n+1]"
* married (MARRY code 1 or 7) in any round of year 1 (rounds 1-2) or
* year 2 (rounds 3-5)
local mar_y1 "(MARRY1==1 | MARRY1==7 | MARRY2==1 | MARRY2==7)"
local mar_y2 "(MARRY3==1 | MARRY3==7 | MARRY4==1 | MARRY4==7 | MARRY5==1 | MARRY5==7)"

* --- round-level spouse variables ---
gen spouse_educ=.
forvalues r = 1/5 {
    local married "(MARRY`r'==1 | MARRY`r'==7) & SPOUID!=."

    gen spouse_age`r' = AGE`r'[_n-1] if `prev' & `married'
    replace spouse_age`r' = AGE`r'[_n+1] if `next' & `married'

    gen spouse_health`r' = RTHLTH`r'[_n-1] if `prev' & `married'
    replace spouse_health`r' = RTHLTH`r'[_n+1] if `next' & `married'

    * spouse education is filled from any round in which the person is married
    replace spouse_educ = EDUC[_n-1] if `prev' & `married'
    replace spouse_educ = EDUC[_n+1] if `next' & `married'
}

* --- year-level spouse wage and total income ---
local src_WAGE   "WAGE"
local src_income "TOTALINC"
foreach out in WAGE income {
    forvalues y = 1/2 {
        gen spouse_`out'_YR`y' = `src_`out''_Y`y'[_n-1] if `prev' & `mar_y`y'' & SPOUID!=.
        replace spouse_`out'_YR`y' = `src_`out''_Y`y'[_n+1] if `next' & `mar_y`y'' & SPOUID!=.
    }
}

* Missing dividend, trust and interest income is set to zero so that the
* sums below are defined. This changes the source variables themselves.
foreach var of varlist DIVINC_Y* TRINC_Y* INTINC_Y* {
    replace `var'=0 if `var'==.
}

forvalues y = 1/2 {
    gen spouse_int_income_YR`y' = DIVINC_Y`y'[_n-1] + TRINC_Y`y'[_n-1] + INTINC_Y`y'[_n-1] if `prev' & `mar_y`y'' & SPOUID!=.
    replace spouse_int_income_YR`y' = DIVINC_Y`y'[_n+1] + TRINC_Y`y'[_n+1] + INTINC_Y`y'[_n+1] if `next' & `mar_y`y'' & SPOUID!=.
}

* own dividend + trust + interest income
gen int_income_YR1=DIVINC_Y1 + TRINC_Y1 + INTINC_Y1 
gen int_income_YR2=DIVINC_Y2 + TRINC_Y2 + INTINC_Y2  

*tab int_income_YR1 if int_income_YR1<100

* --- year-level spouse expenditure variables ---
foreach out in TOTSLF TOTEXP TOTTCH RXEXP {
    forvalues y = 1/2 {
        gen spouse_`out'_YR`y' = `out'_Y`y'[_n-1] if `prev' & `mar_y`y'' & SPOUID!=.
        replace spouse_`out'_YR`y' = `out'_Y`y'[_n+1] if `next' & `mar_y`y'' & SPOUID!=.
    }
}

}

* recoding some missings
qui{
* Three kinds of recodes are applied to groups of variables:
*   (a) negative codes            -> missing
*   (b) non-positive codes        -> missing
*   (c) negative codes -> missing, and the value 2 -> 0
* Ranges such as CHOLDX3-ARTHTX5 rely on the current variable order.

* (a) AD* items, the round date variables, spouse income and wages
foreach v of varlist ADNERV* ADHOPE* ADREST* ADSAD* ADEFRT* ADWRTH* ADINTR* ADDPRS* ADCAPE* ADNRGY* ADDOWN* ///
        BEGRFD1-ENDRFY5 ///
        spouse_income_YR1 spouse_income_YR2 spouse_WAGE_YR1 spouse_WAGE_YR2 {
    replace `v' = . if `v' < 0
}

* (b) yearly insurance coverage
foreach v of varlist INSCOV_Y1 INSCOV_Y2 {
    replace `v' = . if `v' <= 0
}

* (c) TEMPJB by round and the diagnosis/condition variables
foreach v of varlist TEMPJB1-TEMPJB5 CHOLDX3-ARTHTX5 {
    replace `v' = . if `v' < 0
    replace `v' = 0 if `v' == 2
}

}

 
 *EDUCATION
 qui{
* Education categories derived from years of schooling (EDUC for the person,
* spouse_educ for the spouse):
*   educ_type:  1 = under 12, 2 = exactly 12, 3 = above 12 and under 16, 4 = 16+
*   college:    1 if educ_type is 4, 0 for types 1-3
*   educ_group: 1 = types 1-2, 2 = type 3, 3 = type 4
* All are missing when the underlying years of schooling are missing.
gen educ_type = cond(EDUC<12, 1, cond(EDUC==12, 2, cond(EDUC<16, 3, 4))) if EDUC!=.

gen spouse_educ_type = cond(spouse_educ<12, 1, cond(spouse_educ==12, 2, cond(spouse_educ<16, 3, 4))) if spouse_educ!=.

label define educ 1 "No High School Degree" 2 "High School Degree" 3 "Some College" 4 "College Degree (4+ yrs)"
foreach v in educ_type spouse_educ_type {
    label values `v' educ
}

* college variable
gen spouse_college = (spouse_educ_type==4) if inrange(spouse_educ_type, 1, 4)

gen college = (educ_type==4) if inrange(educ_type, 1, 4)

* education groups 
gen educ_group = cond(educ_type<=2, 1, educ_type-1) if inlist(educ_type, 1, 2, 3, 4)

label define edclab 1 "HS or Less" 2 "Some College" 3 "College"
label values educ_group edclab
label var educ_group "Education"

 }


*RACE
qui{
* RACE is collapsed to four groups; codes 1 and 2 are left as they are.
recode RACE (4 5 7 = 3) (3 6 8 = 4)
replace RACE =. if RACE<=0
label define race 1 "White" 2 "Black" 3 "Asian or Pacific Islander" 4 "Other", replace
label values RACE race

* RACETHN, author notes. From 2002 onwards the MEPS variable gives:
*   1 PERSON IS HISPANIC
*   2 PERSON IS BLACK-NO OTH RACE/NOT HISPANIC
*   3 PERSON IS ASIAN-NO OTH RACE/NOT HISPANIC
*   4 OTHER RACE/NOT HISPANIC
* The Asian category exists only from 2002 onwards (inclusive). Before that,
* category 3 holds Asian and other (including whites); it is moved to code 5.
replace RACETHN=5 if  RACETHN==3 & YEAR<=2001
* From 2012 on the raw codes are remapped onto the scheme labelled below
* (3 becomes 2, 4 becomes 3, 2 and 5 become 4; 1 is unchanged).
recode RACETHN (3 = 2) (4 = 3) (2 5 = 4) if YEAR>=2012

label define racethn_l 1 "Hispanic" 2 "Black" 3 "Asian" 4 "Other" 5 "Asian or other <=2001", replace
label values RACETHN racethn_l

* Summary variable RACE_sum, assigned in priority order:
*   2 Black (RACETHN is 2), 3 Hispanic (RACETHN is 1),
*   1 everyone else with a non-missing RACE (white and other are combined).
gen RACE_sum = cond(RACETHN==2, 2, cond(RACETHN==1, 3, cond(RACE!=., 1, .)))

label define racethn_l1 1 "White/Other" 2 "Black" 3 "Hispanic" 
label values RACE_sum racethn_l1
}


* marital status
* MAR is 1 when the round marital code is 1 or 7 (married), 0 for any other
* code of 1 or more (single), and missing when the code is below 1 or missing.
forvalues r = 1/5 {
    gen MAR`r' = inlist(MARRY`r', 1, 7) if MARRY`r'>=1 & MARRY`r'!=.
}

*family size
qui{
* fam_size by round: number of kids or other family members besides the
* person and (if married) the spouse, coded
*   1 = none, 2 = one, 3 = two, 4 = three or more.
forvalues r = 1/5 {
    * Members other than self and spouse:
    *   single:                              FAMSZE - 1
    *   married, family of two or more:      FAMSZE - 2
    *   married, family of one (spouse not in family?): FAMSZE - 1
    gen fam_oth = FAMSZE`r' - 1 - (MAR`r'==1 & FAMSZE`r'>=2) if MAR`r'==0 | (MAR`r'==1 & (FAMSZE`r'>=2 | FAMSZE`r'==1))
    gen fam_size`r' = cond(fam_oth>2, 4, fam_oth+1) if inlist(fam_oth, 0, 1, 2) | (fam_oth>2 & fam_oth!=.)
    drop fam_oth
}

* Raw family size: non-positive codes become missing, sizes are capped at 4.
* This happens after fam_size has been derived from the uncapped values.
forvalues r = 1/5 {
    replace FAMSZE`r' = . if FAMSZE`r' <= 0
    replace FAMSZE`r' = 4 if FAMSZE`r' > 4 & FAMSZE`r' != .
}
}


* BMI and health risks
qui{
* BMI classes, benchmark taken from
* http://www.cancer.org/cancer/cancercauses/dietandphysicalactivity/bodyweightandcancerrisk/body-weight-and-cancer-risk-adult-bmi
*   1 = 0 to 24.9 (normal or low), 2 = above 24.9 up to 29.9 (overweight),
*   3 = above 29.9 (obese).
* The website uses 25 and 30 as lower bounds; the cutoffs here are "greater
* than 24.9 / 29.9" so that values such as 24.9xx or 29.9xx are not lost.
* Negative BMI codes stay missing.
* BMI_YR1 comes from round 3 (BMINDX3), BMI_YR2 from round 5 (BMINDX5).
label define BMI2010 1 "BMI Normal or Low" 2 "BMI Overweight" 3 "BMI Obese"
local yr 0
foreach r in 3 5 {
    local ++yr
    gen BMI_YR`yr' = cond(BMINDX`r'<=24.9, 1, cond(BMINDX`r'<=29.9, 2, 3)) if BMINDX`r'>=0 & BMINDX`r'!=.
    label values BMI_YR`yr' BMI2010
}
tab BMI_YR2

* ADRISK items: negative codes become missing
foreach v of varlist ADRISK2-ADRISK4 {
    replace `v' = . if `v' < 0
}
}	
	
*  HEALTH BEHAVIORS 
qui{
* Rounds 3 and 5: seat belt use (codes below 1 and code 6 become missing)
* and preventive care (flu shot, check-up; codes below 1 become missing).
foreach r in 3 5 {
    replace SEATBE`r' = . if SEATBE`r'<1 | SEATBE`r'==6
    replace FLUSHT`r' = . if FLUSHT`r'<1
    replace CHECK`r' = . if CHECK`r'<1
}

* SMOKING AND EXERCISE: non-positive codes become missing, 2 is recoded to 0
foreach v in ADSMOK2 PHYACT3 ADSMOK4 PHYACT5 {
    replace `v' = . if `v' <= 0
    replace `v' = 0 if `v' == 2
}

* exercise_YR is the reverse of PHYACT: 1 if the person does NOT exercise
* (PHYACT is 0), 0 if the person does (PHYACT is 1); missing otherwise.
gen exercise_YR1 = 1 - PHYACT3 if inlist(PHYACT3, 0, 1)

gen exercise_YR2 = 1 - PHYACT5 if inlist(PHYACT5, 0, 1)
}

 

* From here on the survey year is referred to in lower case.
rename YEAR year

***  INSURANCE 
qui{
* Round-level insurance indicators: non-positive codes become missing and
* the value 2 is recoded to 0.
foreach stub in OFFER HELD PRIVAT UNION MCAID MCARE {
    forvalues r = 1/5 {
        replace `stub'`r' = . if `stub'`r' <= 0
        replace `stub'`r' = 0 if `stub'`r' == 2
    }
}

* HELD_Orig keeps the cleaned HELD value. HELD itself is then blanked for
* people with HELD equal to 0 who nevertheless have private coverage.
forvalues r = 1/5 {
    gen HELD_Orig`r' = HELD`r'
    replace HELD`r' = . if PRIVAT`r'==1 & HELD`r'==0
}

label var PRIVAT1 "Private HI"
label var HELD_Orig1 "Has ESHI"

* yearly coverage summary
label define ins3 1 "Private" 2 "Public" 3 "Uninsured"
foreach v in INSCOV_Y1 INSCOV_Y2 {
    label var `v' "HI"
    label values `v' ins3
}

}


* IND AND OCC VARIABLES 
qui{
* For rows with year 2001, the round 1 industry and occupation categories
* (INDCAT1, OCCCAT1) are filled from the differently coded CIND1 and COCCP1
* using the crosswalks below. Afterwards the codes -1 and -9 become missing
* in every round and the code -2 is replaced by the previous round's value.
//tab INDCAT1 if year==2001
//tab INDCAT4 if year==2001
//tab INDCAT5 if year==2001
//tab INDCAT5 if year==2002

***** INDUSTRY GROUP IN 2001: CIND1 code (ind_old) -> INDCAT1 code (ind_new)
* NOTE(review): CIND1 code 13 has no entry while code 14 maps to 14 -
* confirm against the codebook.
local ind_old "1 2 3 4 5 6 7 8 9 10 11 12 14"
local ind_new "1 2 3 4 6 5 8 12 12 11 9 13 14"
local n_ind : word count `ind_old'
forvalues i = 1/`n_ind' {
    local old : word `i' of `ind_old'
    local new : word `i' of `ind_new'
    replace INDCAT1 = `new' if CIND1==`old' & year==2001
}

***** OCC GROUP IN 2001: COCCP1 code (occ_old) -> OCCCAT1 code (occ_new)
local occ_old "2 10 1 8 3 4 11 5 9 7 6 13"
local occ_new "1 1 2 3 4 5 6 7 7 8 8 9"
local n_occ : word count `occ_old'
forvalues i = 1/`n_occ' {
    local old : word `i' of `occ_old'
    local new : word `i' of `occ_new'
    replace OCCCAT1 = `new' if COCCP1==`old' & year==2001
}

tab OCCCAT1 if year==2001
tab OCCCAT1 if year==2002
*tab INDCAT1 if year==2001
*tab INDCAT1 if year==2002

* codes -1 and -9 -> missing
forvalues r = 1/5 {
    replace INDCAT`r' = . if inlist(INDCAT`r', -1, -9)
    replace OCCCAT`r' = . if inlist(OCCCAT`r', -1, -9)
}

* code -2: carry the previous round forward, round by round, so a run of -2
* codes inherits the last value before the run
foreach stub in INDCAT OCCCAT {
    forvalues r = 2/5 {
        local p = `r' - 1
        replace `stub'`r' = `stub'`p' if `stub'`r'==-2
    }
}
}


* EMPLOYMENT STATUS
qui{

* TRANSITIONS TO UNEMPLOYMENT
* Defined for people with EMPST1 between 1 and 3:
*   0 if EMPST5 is also between 1 and 3, 1 if EMPST5 is 4 or -1.
gen EMP_TRANS = 0 if inrange(EMPST1, 1, 3) & inrange(EMPST5, 1, 3)
replace EMP_TRANS = 1 if inrange(EMPST1, 1, 3) & inlist(EMPST5, 4, -1)

* Round-level employment (EMP) and labour force status (labor_force).
* EMP: 0 = not employed (EMPST 2 or 4, or EMPST 1, 3 or -1 with fewer than
*          12.31 hours), 1 = part time (EMPST 1 or 3, hours from 12.31 to
*          below 32.31), 2 = full time (EMPST 1 or 3, 32.31 hours or more).
* labor_force: 1 not working, 2 PT no insurance, 3 PT with insurance,
*          4 FT no insurance, 5 FT with insurance, where "with insurance"
*          means HELD_Orig is 1 and "no insurance" means it is 0 or missing.
forvalues r = 1/5 {
    gen EMP`r' = 0 if inlist(EMPST`r', 2, 4) | (inlist(EMPST`r', 1, 3, -1) & HOUR`r'<12.31 & HOUR`r'!=.)
    replace EMP`r' = 1 if inlist(EMPST`r', 1, 3) & HOUR`r'>=12.31 & HOUR`r'<32.31
    replace EMP`r' = 2 if inlist(EMPST`r', 1, 3) & HOUR`r'>=32.31 & HOUR`r'!=.

    gen labor_force`r' = 1 if EMP`r'==0
    * base code 2 (PT) or 4 (FT), plus 1 when insurance is held
    replace labor_force`r' = cond(EMP`r'==2, 4, 2) + (HELD_Orig`r'==1) if inlist(EMP`r', 1, 2) & (HELD_Orig`r'==1 | HELD_Orig`r'==0 | HELD_Orig`r'==.)
}
*

* LF: same as labor_force, except that EMPST equal to -1 (inapplicable) is
* always counted as not employed.
forvalues r = 1/5 {
    gen LF`r' = labor_force`r'
    replace LF`r' = 1 if EMPST`r'==-1
}
*

label define lfs 1 "Not Employed" 2 "PT no ESHI" 3 "PT w/ ESHI" 4 "FT no ESHI" 5 "FT w/ ESHI"
label values labor_force* lfs

}


*** more variables for spouses
qui{
* Copies further characteristics of the spouse onto each person with a
* non-missing SPOUID (temp_var equal to 1). The spouse is looked for on the
* previous and on the next row after sorting; a row qualifies when it is in
* the same panel and dwelling unit and its SPOUID equals this row's PID.
*
* Fix: PANEL is now part of the sort key and of the match condition. The
* file builds ID from PANEL because person identifiers are not unique across
* panels; without PANEL, same-numbered households from different panels were
* interleaved by the sort and could be matched to each other or could sit
* between two true spouses.
sort temp_var PANEL YR_1_2 DUID PID 

local prev "PANEL==PANEL[_n-1] & DUID==DUID[_n-1] & PID==SPOUID[_n-1] & temp_var==1"
local next "PANEL==PANEL[_n+1] & DUID==DUID[_n+1] & PID==SPOUID[_n+1] & temp_var==1"

* variables
gen spouse_RACE = RACE[_n-1] if `prev'
replace spouse_RACE = RACE[_n+1] if `next'

* round-level variables, rounds 1 to 5
forvalues r = 1/5 {
    foreach v in EMP EMPST HOUR INDCAT PRIVAT OFFER HELD_Orig MCARE MCAID {
        gen spouse_`v'`r' = `v'`r'[_n-1] if `prev'
        replace spouse_`v'`r' = `v'`r'[_n+1] if `next'
    }
}

* smoking is only available in rounds 2 and 4
foreach r in 2 4 {
    gen spouse_ADSMOK`r' = ADSMOK`r'[_n-1] if `prev'
    replace spouse_ADSMOK`r' = ADSMOK`r'[_n+1] if `next'
}

/*
Disabled by the author: spouse versions of TC, FLUSHT, CHECK (rounds 3 and 5)
and of exercise_YR, BMI_YR, SSIDIS_Y, INSCOV_Y (years 1 and 2).

foreach r in 3 5 {
    foreach v in TC FLUSHT CHECK {
        gen spouse_`v'`r' = `v'`r'[_n-1] if `prev'
        replace spouse_`v'`r' = `v'`r'[_n+1] if `next'
    }
}

forvalues y = 1/2 {
    gen spouse_exercise_YR`y' = exercise_YR`y'[_n-1] if `prev'
    replace spouse_exercise_YR`y' = exercise_YR`y'[_n+1] if `next'

    gen spouse_BMI_YR`y' = BMI_YR`y'[_n-1] if `prev'
    replace spouse_BMI_YR`y' = BMI_YR`y'[_n+1] if `next'

    gen spouse_SSIDIS_YR`y' = SSIDIS_Y`y'[_n-1] if `prev'
    replace spouse_SSIDIS_YR`y' = SSIDIS_Y`y'[_n+1] if `next'

    gen spouse_INSCOV_YR`y' = INSCOV_Y`y'[_n-1] if `prev'
    replace spouse_INSCOV_YR`y' = INSCOV_Y`y'[_n+1] if `next'
}
*/
}


//rename RTHLTH1 RTHLTH_YR1
//rename RTHLTH5 RTHLTH_YR2
//rename MNHLTH1 MNHLTH_YR1
//rename MNHLTH5 MNHLTH_YR2


* demographics
qui{
* Builds yearly INSCOP flags, labels SEX, and creates birth cohorts, yearly
* age variables with powers, and age groups. Also labels MAR1-MAR5.

* INSCOP_Y1 is 1 when INSCOP equals 1 in rounds 1, 2 and 3; INSCOP_Y2 is 1
* when it equals 1 in rounds 3, 4 and 5. Both are missing otherwise.
gen INSCOP_Y1=1 if INSCOP1==1 & INSCOP2==1 & INSCOP3==1
gen INSCOP_Y2=1 if INSCOP3==1 & INSCOP4==1 & INSCOP5==1


*1=MALE, 2=FEM

label define MF 1 "Men" 2 "Women"
label values SEX MF


**** COHORTS (by year of birth)
*   1: 1985 or later, 2: 1975-1984, 3: 1965-1974, 4: 1955-1964, 5: 1954 or earlier
* NOTE(review): any non-positive DOBYY code also satisfies the last condition
* and lands in cohort 5 - confirm DOBYY holds no such codes at this point.
gen cohort=1 if DOBYY>=1985 & DOBYY!=.
replace cohort=2 if DOBYY>=1975 & DOBYY<=1984
replace cohort=3 if DOBYY>=1965 & DOBYY<=1974
replace cohort=4 if DOBYY>=1955 & DOBYY<=1964
replace cohort=5 if  DOBYY<=1954


* AGE AND AGE GROUPS
qui{
* Year 1 age is the age at the start of round 1; year 2 age is that plus one.
gen AGE_YR1=AGE1
gen AGE_SQ_YR1=AGE1^2
gen  AGE_YR2=AGE1+1
* Fix: the square for year 2 is taken of AGE_YR2 itself. It used to be
* AGE2^2 (age at the start of round 2), which is not the square of AGE_YR2.
gen AGE_SQ_YR2=AGE_YR2^2

gen AGE1_sq=AGE1^2
gen AGE1_cub=AGE1^3


* age groups

* Five-year groups of year 2 age, labelled by their lower bound (20 to 65);
* ages below 20 or 70 and above stay missing.
gen age_group2=.
forvalues lo = 20(5)65 {
    replace age_group2 = `lo' if AGE_YR2>=`lo' & AGE_YR2<`lo'+5
}

* Three-year groups labelled 51, 54, 57, 60, 63. Later lines overwrite
* earlier ones where the conditions overlap.
* NOTE(review): the upper bounds use AGE_YR1 while the lower bounds use
* AGE_YR2. Left unchanged; confirm that the mix is intended.
gen age_group3=51 if AGE_YR1<53 & AGE_YR2>=50
replace age_group3=54 if AGE_YR1<56 & AGE_YR2>=53
replace age_group3=57 if  AGE_YR1<59 & AGE_YR2>=56
replace age_group3=60 if  AGE_YR1<62 & AGE_YR2>=59
replace age_group3=63 if  AGE_YR1<65 & AGE_YR2>=62
}



forvalues r = 1/5 {
    label var MAR`r' "Married R`r'"
}
}


* min wages
qui{
* min_wage by survey year: 5.15 for 1997-2006, 5.85 in 2007, 6.55 in 2008,
* 7.25 from 2009 onwards.
* NOTE(review): years before 1997 (the data include 1996) keep a missing
* min_wage - confirm that this is intended.
gen min_wage = .
replace min_wage = 5.15 if inrange(year, 1997, 2006)
foreach step in "2007 5.85" "2008 6.55" "2009 7.25" {
    gettoken yr amt : step
    replace min_wage = `amt' if year == `yr'
}
replace min_wage = 7.25 if year >= 2010 & year != .
}

* CPI
* CPI  = price index of the calendar year in variable year   (defined for 1996 to 2020)
* CPI2 = price index of the following calendar year           (defined for 1996 to 2020)
qui{
// https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-

* One series, 1996 through 2021, feeds both variables so that CPI for year t+1 and CPI2
* for year t can never disagree. Previously the 2019 index was typed as 255.7 in CPI
* but 255.657 in CPI2; 255.657 is now used in both places. All other values are unchanged.
local cpi_series 156.9 160.5 163 166.6 172.2 177.1 179.9 184 188.9 195.3
local cpi_series `cpi_series' 201.6 207.342 215.303 214.537 218.056 224.939 229.594 232.957
local cpi_series `cpi_series' 236.736 237.017 240.007 245.12 251.107 255.657 258.8 271.0

gen CPI=.
gen CPI2=. //this is from next year

local cpi_year=1996
foreach cpi_value of local cpi_series {
    * current-year index: the last entry (2021) only serves as the next-year value for 2020
    replace CPI=`cpi_value' if year==`cpi_year' & `cpi_year'<=2020
    * next-year index: the first entry (1996) has no preceding survey year to attach to
    replace CPI2=`cpi_value' if year==`cpi_year'-1 & `cpi_year'>=1997
    local ++cpi_year
}

}


*CONVERT TO 2010 DOLLARS and to thousands of dollars.
* NOTE: this section rescales CPI, CPI2 and the money variables in place, so it must be run exactly once.
qui{

* rebase both price indices so that the 2010 value (218.056) becomes 100
foreach deflator of varlist CPI CPI2 {
    replace `deflator' = `deflator'/218.056*100
}

* year 1 amounts are deflated with CPI, year 2 amounts with CPI2
forvalues y = 1/2 {
    local index CPI
    if `y'==2 local index CPI2

    * spending, charges and income variables: deflated and divided by 1000
    foreach amount of varlist TOTEXP_Y`y' TOTSLF_Y`y' TOTTCH_Y`y' RXEXP_Y`y' TOTALINC_Y`y' FAMINC_Y`y' spouse_TOTSLF_YR`y' spouse_TOTEXP_YR`y' spouse_TOTTCH_YR`y' spouse_RXEXP_YR`y' spouse_WAGE_YR`y' spouse_income_YR`y' {
        replace `amount'=(`amount'*100/`index')/1000
    }

    * own wage: deflated only, not divided by 1000
    replace WAGE_Y`y'=(WAGE_Y`y'*100/`index')
}

* the minimum wage is deflated with the current-year index, not divided by 1000
replace min_wage=(min_wage*100/CPI)

* squared year 1 income, computed after the conversion above
gen TOTALINC_Y1_sq=TOTALINC_Y1^2
label var TOTALINC_Y1 "Income"
label var TOTALINC_Y1_sq "Income sq."
}


* Medical charges and expenditures
* Gives the year 1 and year 2 totals descriptive names and derives two combinations.
* Year 1 variables carry no suffix, year 2 variables carry the suffix _2.
qui{
forvalues y = 1/2 {
    local tag
    if `y'==2 local tag _2

    rename TOTTCH_Y`y' Charges_excl_RX`tag'
    rename TOTEXP_Y`y' Expenditures_incl_RX`tag'
    rename RXEXP_Y`y' RX`tag'

    * expenditures net of RX, and charges with RX added on top
    gen Expenditures_excl_RX`tag' = Expenditures_incl_RX`tag'-RX`tag'
    gen Charges_plus_RX`tag' = Charges_excl_RX`tag' + RX`tag'
}
}


* consider measures of employment and insurance that take into account the length of each round
* Steps: (1) decide which year round 3 is counted in, (2) measure each round in days using
* 30.4 days per month, (3) turn the rounds into shares of year 1 and year 2, (4) build
* share-weighted yearly hours, own-job insurance and Medicaid, (5) classify employment.
qui{

**** for employment, we will often need to limit to those employed for the full year
* first, we need to determine if Round 3 lies mainly in year 1 or 2
* then, we construct full year employment variables

* yr1 = ENDRFY2 minus BEGRFY1, yr5 = ENDRFY3 minus BEGRFY3 (1 when round 3 crosses into the next year)
gen yr1=ENDRFY2-BEGRFY1
gen yr5=ENDRFY3-BEGRFY3

* count_3_1 = 1 when round 3 is counted in year 1, count_3_2 = 1 when it is counted in year 2
gen count_3_1=0 if yr1==1 /* if round 2 ends the following year, then round 3 is definitely in the next, but this actually never happens in the data */
gen count_3_2=1 if yr1==1
* share of round 3 months falling in year 2, with the round taken to start in month ENDRFM2
* NOTE(review): if either month is missing the share is missing, and Stata evaluates
* missing >= 0.5 as true, so such rows are counted in year 2 - confirm this is acceptable
replace count_3_1=0 if yr5==1 & (ENDRFM3/(12-ENDRFM2+ENDRFM3))>=0.5
replace count_3_2=1 if yr5==1 & (ENDRFM3/(12-ENDRFM2+ENDRFM3))>=0.5
* count in year 1 if the months in year 2 are less than half of all months of R3 (most of the round is in year 1)
replace count_3_1=1 if yr5==1 & (ENDRFM3/(12-ENDRFM2+ENDRFM3))<0.5
replace count_3_2=0 if yr5==1 & (ENDRFM3/(12-ENDRFM2+ENDRFM3))<0.5


* the day of the round start and end are often missing but the month and year are there.
* so if the month is there, we impute the day as the 15th of the month
foreach dayvar in ENDRFD1 BEGRFD2 ENDRFD2 BEGRFD3 ENDRFD3 BEGRFD4 ENDRFD4 BEGRFD5 {
    local monthvar = subinstr("`dayvar'","RFD","RFM",1)
    replace `dayvar'=15 if `dayvar'==. & `monthvar'!=.
}


* GENERATE ROUND LENGTH AS FRACTION OF FIRST YEAR AND SECOND YEAR
*30.4 DAYS PER MONTH

* round 1: days from the start of the year to the round end
* (for an end month of 1 the month term is zero, so this is just the end day)
gen R_1_days= 30.4* (ENDRFM1-1) + ENDRFD1

* R2 never ends in year 2.
* round starting and ending in the same month: days elapsed between the two dates
* (this case used to be measured to the end of the month, ignoring the end day)
gen R_2_days= ENDRFD2-BEGRFD2 if BEGRFM2==ENDRFM2 & BEGRFM2!=.
* round ending 1 to 11 months after it began: rest of the start month, plus days into
* the end month, plus 30.4 for every full month in between
replace R_2_days= (30.4-BEGRFD2) + ENDRFD2 + 30.4*(ENDRFM2-BEGRFM2-1) if inrange(ENDRFM2-BEGRFM2,1,11)


* part of round 3 that lies in year 1: from the round start to the end of month 12
gen R_3_days_yr1=( 30.4-BEGRFD3 )+ 30.4*(12-BEGRFM3)  if yr5==1 & BEGRFM3!=. & ENDRFM3!=.
replace R_3_days_yr1=. if R_3_days_yr1<0

gen total_y1 = R_1_days+R_2_days+R_3_days_yr1
tab total_y1

* part of round 3 that lies in year 2: from the start of the year to the round end
gen R_3_days_yr2= 30.4* (ENDRFM3-1) + ENDRFD3


* round 4: same construction as round 2
gen R_4_days= ENDRFD4-BEGRFD4 if BEGRFM4==ENDRFM4 & BEGRFM4!=.
replace R_4_days= (30.4-BEGRFD4) + ENDRFD4 + 30.4*(ENDRFM4-BEGRFM4-1) if inrange(ENDRFM4-BEGRFM4,1,11)

gen yr_R3=ENDRFY5- BEGRFY5
tab yr_R3

* round 5: from the round start to the end of month 12
gen R_5_days= (12-BEGRFM5)*30.4 +   (30.4-BEGRFD5)
gen total_y2=R_3_days_yr2+R_4_days+R_5_days

*** construct hours worked per year.
* round shares are only defined when the rounds add up to one year (364 up to, not including, 366 days)
local full_y1 total_y1>=364 & total_y1<366
local full_y2 total_y2>=364 & total_y2<366

gen R1_fr=R_1_days /total_y1 if `full_y1'
gen R2_fr=R_2_days /total_y1 if `full_y1'
gen R3_fr_yr1=R_3_days_yr1 /total_y1 if `full_y1'

gen R3_fr_yr2=R_3_days_yr2/total_y2 if `full_y2'
gen R4_fr=R_4_days /total_y2 if `full_y2'
gen R5_fr=R_5_days /total_y2 if `full_y2'

* hours coded -1 are set to zero, every other negative code becomes missing
foreach var of varlist (HOUR1- HOUR5) {
replace `var'=0 if `var'==-1
replace `var'=. if  `var'<0
}

* share-weighted hours, then scaled by 52.143 weeks per year
gen HOURS_Y1 = R1_fr*HOUR1 + R2_fr*HOUR2 + R3_fr_yr1*HOUR3
gen HOURS_Y2 = R3_fr_yr2*HOUR3 +  R4_fr*HOUR4 + R5_fr* HOUR5

replace HOURS_Y1=HOURS_Y1*52.143
replace HOURS_Y2=HOURS_Y2*52.143

* insurance held through own job
* NOTE(review): the share-weighted averages below are only meaningful if HELD_Orig and MCAID
* are coded 0/1 at this point - confirm against the earlier recoding
gen HELD_Y1 = R1_fr*HELD_Orig1 + R2_fr*HELD_Orig2 + R3_fr_yr1*HELD_Orig3
gen HELD_Y2 = R3_fr_yr2*HELD_Orig3 +  R4_fr*HELD_Orig4 + R5_fr* HELD_Orig5
tab HELD_Y1

gen Mcaid_Y1 = R1_fr*MCAID1 + R2_fr*MCAID2 + R3_fr_yr1*MCAID3
gen Mcaid_Y2 = R3_fr_yr2*MCAID3 + R4_fr*MCAID4 + R5_fr*MCAID5

* Medicaid indicator: 1 when the share-weighted value is at least one half
foreach num of numlist 1 2  {
replace Mcaid_Y`num' =0 if 	Mcaid_Y`num'<0.5
replace Mcaid_Y`num' =1 if 	Mcaid_Y`num'>=0.5  & Mcaid_Y`num'!=.
}

foreach num of numlist 1 2  {
* yearly hours: below 520 = not employed, 520 to 1500 = part time, above 1500 = full time
gen EMP_Y`num' =0 if HOURS_Y`num'<520
replace EMP_Y`num'=1 if HOURS_Y`num'>=520 &  HOURS_Y`num'<=1500 // EMPLOYED PT
replace EMP_Y`num'=2 if   HOURS_Y`num'>1500  & HOURS_Y`num'!=.  // EMPLOYED FT

* insured means a share-weighted HELD value of at least one half
gen labor_force_Y`num'=5 if EMP_Y`num'==2 & HELD_Y`num'>=0.5  & HELD_Y`num'!=.  //ft with insurance
replace labor_force_Y`num'=4 if EMP_Y`num'==2 & HELD_Y`num'<0.5  //ft no insurance
replace labor_force_Y`num'=3 if EMP_Y`num'==1 & HELD_Y`num'>=0.5  & HELD_Y`num'!=.  //pt with insurance
replace labor_force_Y`num'=2 if EMP_Y`num'==1 & HELD_Y`num'<0.5   //pt no insurance
replace labor_force_Y`num'=1 if EMP_Y`num'==0                //not working
}


* classify emp and non-emp as (NE, PT) and (FT)
foreach num of numlist 1 2  {
gen EMPLOYMENT_YR`num'_v1 =0 if EMP_Y`num'==0 | EMP_Y`num'==1
replace EMPLOYMENT_YR`num'_v1 =1 if EMP_Y`num'==2
}


* classify emp and non-emp as (NE, PT, FT NO INS) and (FT WITH INSURANCE)
foreach num of numlist 1 2  {
gen EMPLOYMENT_YR`num'_v2 =0 if labor_force_Y`num'<5 & labor_force_Y`num'!=.
replace EMPLOYMENT_YR`num'_v2 =1 if labor_force_Y`num'==5
}

label define emp_v1 0 "No" 1 "Yes"
label values EMPLOYMENT_YR1_v1 emp_v1
label values EMPLOYMENT_YR2_v1 emp_v1
label values EMPLOYMENT_YR1_v2 emp_v1
label values EMPLOYMENT_YR2_v2 emp_v1

}


* insurance - alternative measures
* PRIVATE_HI_Y1/2 : 1 for labor force categories 3 and 5, 0 for categories 1, 2 and 4,
*                   missing whenever the labor force category itself is missing
* INSCOV_Y1/2_alt : 1 private, then 2 where Medicaid is 1, then 3 where INSCOV is 3
*                   (later assignments overwrite earlier ones)
qui{

* own-job coverage indicator derived from the labor force category
forvalues y = 1/2 {
    gen PRIVATE_HI_Y`y' = .
    replace PRIVATE_HI_Y`y' = 0 if inlist(labor_force_Y`y',1,2,4)
    replace PRIVATE_HI_Y`y' = 1 if inlist(labor_force_Y`y',3,5)
}

label define ins4 0 "No ESHI" 1 "Has ESHI"
forvalues y = 1/2 {
    label var PRIVATE_HI_Y`y' "Has Private HI"
    label values PRIVATE_HI_Y`y' ins4
}

* three-way coverage variable; the value label ins3 is not defined in this section
*label define ins3 1 "Private" 2 "Public" 3 "Uninsured"
forvalues y = 1/2 {
    gen INSCOV_Y`y'_alt = 1 if PRIVATE_HI_Y`y'==1
    replace INSCOV_Y`y'_alt = 2 if Mcaid_Y`y'==1
    replace INSCOV_Y`y'_alt = 3 if INSCOV_Y`y'==3
    label values INSCOV_Y`y'_alt ins3
}

}


* correct wages
* For each round wage HRWG1 to HRWG5: fill in values coded -10, blank the other negative
* codes, deflate with CPI or CPI2, and drop wages below half the minimum wage.
* Note that bysort leaves the data sorted by year.
foreach num of numlist 1 2 3 4 5 {
    * values coded -10 (presumably the top-code flag - confirm) take the highest wage observed in the same year
    bysort year: egen max_wage`num'=max(HRWG`num')
    replace HRWG`num'=max_wage`num' if HRWG`num'==-10
    drop max_wage`num'

    * every remaining negative code becomes missing
    replace HRWG`num'=. if HRWG`num'<0

    * rounds 1 and 2 use CPI, rounds 4 and 5 use CPI2, round 3 follows count_3_1 / count_3_2
    * NOTE(review): round 3 wages stay unadjusted when both count_3 flags are missing - confirm
    replace HRWG`num'=HRWG`num'*100/CPI if  (`num'==1 | `num'==2 | (`num'==3 & count_3_1==1) )  // cpi adjust
    replace HRWG`num'=HRWG`num'*100/CPI2 if  (`num'==4 | `num'==5 | (`num'==3 & count_3_2==1) )  // cpi adjust

    * replace to missing if less than half the minimum wage in that year.
    * The missing check matters: a missing min_wage compares as larger than any wage,
    * so without it every wage in a year lacking a minimum wage would be erased.
    replace HRWG`num'=. if HRWG`num' <min_wage/2 & !missing(min_wage)
}
*

* more spouse variables
* spouse_LF1 to spouse_LF3 : round-level work indicator for the spouse
*    0 = employment status 2 or 4, or status 1, 3 or -1 with at most 20 hours
*    1 = employment status 1 or 3 with more than 20 hours
* spouse_LF_Y1 : year 1 indicator, 1 only if the spouse works in every round counted in year 1
qui{

forvalues r = 1/3 {
    gen spouse_LF`r'=0 if inlist(spouse_EMPST`r',2,4) | (inlist(spouse_EMPST`r',1,3,-1) & (spouse_HOUR`r'<=20 & spouse_HOUR`r'!=.))
    replace spouse_LF`r'=1 if inlist(spouse_EMPST`r',1,3) & spouse_HOUR`r'>20 & spouse_HOUR`r'!=.
}

* spouse's labor force status in year 1
gen spouse_LF_Y1=.

* round 3 not counted in year 1: only rounds 1 and 2 matter
replace spouse_LF_Y1=0 if count_3_1==0 & (spouse_LF1==0 | spouse_LF2==0)
replace spouse_LF_Y1=1 if count_3_1==0 & spouse_LF1==1 & spouse_LF2==1

* round 3 counted in year 1: rounds 1, 2 and 3 all matter
replace spouse_LF_Y1=0 if count_3_1==1 & (spouse_LF1==0 | spouse_LF2==0 | spouse_LF3==0)
replace spouse_LF_Y1=1 if count_3_1==1 & spouse_LF1==1 & spouse_LF2==1 & spouse_LF3==1

label var spouse_LF_Y1 "Spouse Works"
label values spouse_LF_Y1 emp_v1
}

* put the data in ID / year order and write it to Consolidated_data_reshaped.dta
* in the folder held by the global macro data, overwriting any existing file
sort ID year
save "${data}\Consolidated_data_reshaped.dta", replace
}